/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the datasets that are to be fed into R 
					to create the ensemble predictions. 
					
					This do file saves the data for every year separately.
				
*******************************************************************************/

clear all
global id_code 002

************************************************************************
* 1. Preparation to split the dataset
************************************************************************
* Load the final main dataset and discard all L_Occupation* variables
use "${data}/001_9_FinalMainDataset.dta", clear
drop L_Occupation*

* Shuffle the observations reproducibly (fixed seed) and store the
* shuffled position in n, so that samples defined on n are random samples
set seed 2111
generate double shuffleKey = runiform()
isid shuffleKey // Stop if two observations tie: the ordering must be unique
sort shuffleKey
drop shuffleKey
generate n = _n

* Replace every non-ordinal categorical variable by one indicator variable
* per category (named <variable>_dummy1, <variable>_dummy2, ...)
local categoricals L_Industry_3digit L_civilStatus EducLevel L_Municipality ///
	L_Age_Youngest citizenship migrationCohort
foreach catVar of local categoricals {
	tabulate `catVar', generate(`catVar'_dummy)
	drop `catVar'
}

************************************************************************
* 2. Sample preparation for each model
************************************************************************
pause off
foreach model in Full {

		************************************************************************
		/* Define the year span and set of variables for each model
		(using macros set in 0001_init.do) */
		************************************************************************

		* Reset the settings so that a model name without a definition below
		* cannot silently reuse the settings of the previous iteration
		local sampleName
		local yearSpan
		local varSet
		
		if "`model'"=="Full" {
			local sampleName Full
			local yearSpan 1992/2016
			local varSet "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist"
		}
		
		if "`model'"=="Full_F" {
			local sampleName Full_F
			local yearSpan 1992/2016
			local varSet "$demoEdu $mun $incIndiv $incOther $incHist $indu $emplHist"
		}
		
		if "`model'"=="Full_A" {
			local sampleName Full_A
			local yearSpan 2006/2006
			local varSet "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist"
		}
		
		if "`model'"=="Full_NT" {
			local sampleName Full_NT
			local yearSpan 2006/2006
			local varSet "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist"
		}
		
		if "`model'"=="Full_LT" {
			local sampleName Full_LT
			local yearSpan 2006/2006
			local varSet "$demoEdu $migHist $mun $incIndiv $incOther $incHist $indu $emplHist any_LT* "
		}	

		* Stop with an error instead of exporting files for an undefined model
		if "`sampleName'"=="" {
			display as error "Model `model' has no sample definition in section 2"
			exit 198
		}
		
		************************************************************************
		* Keep only observations from a particular year (note, this isn't done for months 1, 2, 4, 5, etc.)
		************************************************************************

		forval y = `yearSpan' {

			* The full dataset is restored at the end of every year iteration
			preserve

			* Flag observations that keep at least one non-missing emplAft3M
			* outcome for this year; a temporary name avoids any clash with
			* variables already in the dataset
			tempvar inYear
			gen byte `inYear' = 0
			
			forvalues i = 0(3)12{

				* Blank out the outcomes measured i months after startU
				* (approximated as i*30 days) when that date is outside the year
				replace emplAft3M_`i'M_In = . if !inrange(startU + `i'*30, d(01jan`y'),	///
					d(31dec`y'))
				replace emplAft6M_`i'M_In = . if !inrange(startU + `i'*30, d(01jan`y'),	///
					d(31dec`y'))
					
				replace `inYear' = 1 if emplAft3M_`i'M_In!=.
			}	
			
			* Outcomes only defined at month 0 are kept when startU is in the year
			replace emplAft1M_0M_In = . if !inrange(startU, d(01jan`y'), d(31dec`y'))
			
			replace emplAft12M_0M_In = . if !inrange(startU, d(01jan`y'), d(31dec`y'))
			
			drop if `inYear'==0
			drop `inYear'
			
			************************************************************************
			* Keep only observations and variables that belong to the sample
			************************************************************************
			* This is indicated by variable inSample_`sampleName' generated in code 001_09
			keep if inSample_`sampleName' == 1
			pause
			
			* Keep only variables that belong to the model
			keep LopNr_PersonNr InLnr `varSet' inSample_`sampleName' emplAft* n 
			pause
			
			************************************************************************
			* Prepare and save the sample
			************************************************************************

			* Generate variable used to split dataset into parameter tuning, training
			* and prediction partitions.
			sort n
			gen n_order = _n	

			rename inSample_`sampleName' inSample
			
			* The last variables should be covariates starting from gender 
			* this is used to tell R which variables should be included in the model
			order LopNr_PersonNr InLnr n n_order inSample emplAft* Gender, first

			compress
			
			save "${data}/${id_code}_DataForR_`model'_`y'.dta", replace
		
			restore
	
		}
}


************************************************************************
* 3. Creating sub-models by cumulatively dropping sets of variables
************************************************************************
foreach model in Full {
	
	************************************************************************
	* Define the sets of variables that are dropped to create sub-models
	************************************************************************

	* Reset the list so that a model without a definition below cannot
	* silently reuse a list left over from earlier code
	local varSet
	
	if "`model'"=="Full" {
		local varSet "mun indu migHist incHist emplHist incOther incIndiv"
	}
	
	if "`model'"=="Full_F" {
		local varSet "mun indu incHist emplHist incOther incIndiv"
	}

	if "`varSet'"=="" {
		display as error "Model `model' has no variable sets defined in section 3"
		exit 198
	}
	
	************************************************************************
	* Consecutively drop sets of variables (using macros set in 0001_init.do)
	************************************************************************
	
	foreach y in 2006 {
		
		* clear is the documented option of use for discarding the data in memory
		use "${data}/${id_code}_DataForR_`model'_`y'.dta", clear
	
		* The drops accumulate: each saved file lacks the current set and
		* every set listed before it
		foreach varDrop in `varSet' {
		
			* The name in varDrop is used as the name of a global macro
			* holding the variables of that set
			drop $`varDrop'
			save "${data}/${id_code}_DataForR_`model'_SeqDrop_`varDrop'_`y'.dta", replace
		
		}
		
		* Finally, create models dropping past unemployment information:
		* reload the complete file, since the loop above removed variables
		use "${data}/${id_code}_DataForR_`model'_`y'.dta", clear
		
		preserve
			drop DaysUnemp_2Years* DaysUnemp_5Years* 
			save "${data}/${id_code}_DataForR_`model'_DropPastSpells_DaysUnemp_`y'.dta", replace
		restore 
		
		preserve
			drop unemplSpells5Ybefore unemplSpells2Ybefore 
			save "${data}/${id_code}_DataForR_`model'_DropPastSpells_unemplSpells_`y'.dta", replace
		restore 

		preserve
			drop DaysUnemp_2Years* DaysUnemp_5Years* unemplSpells5Ybefore unemplSpells2Ybefore
			save "${data}/${id_code}_DataForR_`model'_DropPastSpells_Both_`y'.dta", replace
		restore 

		
	}
}

************************************************************************
* 4. Creating sub-models by adding one set of variables at a time
************************************************************************
foreach model in Full {
	
	************************************************************************
	* Define the sets of variables that are added, one at a time, to the
	* demographic variables to create sub-models
	************************************************************************

	* Reset the list so that a model without a definition below cannot
	* silently reuse a list left over from earlier code
	local varSet
	
	if "`model'"=="Full" {
		local varSet "mun indu migHist incHist emplHist incOther incIndiv"
	}

	if "`varSet'"=="" {
		display as error "Model `model' has no variable sets defined in section 4"
		exit 198
	}
	
	************************************************************************
	* Keep demographics + one set of variables (using macros set in 0001_init.do)
	************************************************************************
	
	foreach y in 2006 {
		
		* clear is the documented option of use for discarding the data in memory
		use "${data}/${id_code}_DataForR_`model'_`y'.dta", clear
	
		foreach varKeep in `varSet' {
			* Work on a copy so that every sub-model starts from the full file
			preserve
			* The name in varKeep is used as the name of a global macro
			* holding the variables of that set
			keep LopNr_PersonNr InLnr inSample emplAft* n n_order $demoEdu $`varKeep'
			save "${data}/${id_code}_DataForR_`model'_Marg_`varKeep'_`y'.dta", replace
			restore
		}
	}
}
